/****************************************************************************
 *C     
 *C     matrix-hl.f
 *C     An example of matrix-matrix multiplication and using PAPI high level
 *C     to look at the performance. written by Kevin London
 *C     March 2000
 *C     Added to c tests to check stop
 *C****************************************************************************
 */
#include "papi_test.h"
#include <stdlib.h>

int main(int argc, char **argv)
{

#define NROWS1 175
#define NCOLS1 225
#define NROWS2 NCOLS1
#define NCOLS2 150
   double p[NROWS1][NCOLS1], q[NROWS2][NCOLS2], r[NROWS1][NCOLS2];
   int i, j, k, num_events, retval;
   /*     PAPI standardized event to be monitored */
   int event[2];
   /*     PAPI values of the counters */
   long_long values[2], tmp;
   extern int TESTS_QUIET;

   tests_quiet(argc, argv);

   /*     Setup default values */
   num_events = 0;

   /*     See how many hardware events at one time are supported
    *     This also initializes the PAPI library */
   num_events = PAPI_num_counters();
   if (num_events < 2) {
      printf("This example program requries the architecture to "
             "support 2 simultaneous hardware events...shutting down.\n");
      test_skip(__FILE__, __LINE__, "PAPI_num_counters", 1);
   }

   if (!TESTS_QUIET)
      printf("Number of hardware counters supported: %d\n", num_events);

   if (PAPI_query_event(PAPI_FP_INS) != PAPI_OK)
      event[0] = PAPI_TOT_INS;
   else
      event[0] = PAPI_FP_INS;

   /*     Time used */
   event[1] = PAPI_TOT_CYC;

   /*     matrix 1: read in the matrix values */
   for (i = 0; i < NROWS1; i++)
      for (j = 0; j < NCOLS1; j++)
         p[i][j] = i * j * 1.0;

   for (i = 0; i < NROWS2; i++)
      for (j = 0; j < NCOLS2; j++)
         q[i][j] = i * j * 1.0;

   for (i = 0; i < NROWS1; i++)
      for (j = 0; j < NCOLS2; j++)
         r[i][j] = i * j * 1.0;

   /*     Set up the counters */
   num_events = 2;
   retval = PAPI_start_counters(event, num_events);
   if (retval != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_start_counters", retval);

   /*     Clear the counter values */
   retval = PAPI_read_counters(values, num_events);
   if (retval != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_read_counters", retval);

   /*     Compute the matrix-matrix multiplication  */
   for (i = 0; i < NROWS1; i++)
      for (j = 0; j < NCOLS2; j++)
         for (k = 0; k < NCOLS1; k++)
            r[i][j] = r[i][j] + p[i][k] * q[k][j];

   /*     Stop the counters and put the results in the array values  */
   retval = PAPI_stop_counters(values, num_events);
   if (retval != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_stop_counters", retval);

   /*  Make sure the compiler does not optimize away the multiplication
    *  with dummy(r);
    */
#if ((defined(__ALPHA) && defined(__osf__)) || defined(PPC64))
   dummy(r);
#endif

   if (!TESTS_QUIET) {
      if (event[0] == PAPI_TOT_INS) {
         printf(TAB1, "TOT Instructions:", values[0]);
      } else {
         printf(TAB1, "FP Instructions:", values[0]);
      }
      printf(TAB1, "Cycles:", values[1]);
   }

   /*  
    *  Intel Core overreports flops by 50% when using -O
    *  Use -O2 or -O3 to produce the expected # of flops
    */

   if (event[0] == PAPI_FP_INS) {
      /*     Compare measured FLOPS to expected value */
      tmp = 2 * (long_long) (NROWS1) * (long_long) (NCOLS2) * (long_long) (NCOLS1);
      if (abs((int)values[0] - (int)tmp) > tmp * 0.05) {
         /*     Maybe we are counting FMAs? */
         tmp = tmp / 2;
         if (abs((int)values[0] - (int)tmp) > tmp * 0.05) {
            printf("\n" TAB1, "Expected operation count: ", 2 * tmp);
            printf(TAB1, "Or possibly (using FMA):  ", tmp);
            printf(TAB1, "Instead I got:            ", values[0]);
            test_fail(__FILE__, __LINE__,
                      "Unexpected FLOP count (check vector operations)", 1);
         }
      }
   }
   test_pass(__FILE__, 0, 0);
   return (PAPI_EMISC);
}
